# Prepare the data: split the PubTabNet annotations into train/val JSONL files.
import json
# Split the combined PubTabNet annotation file into one JSONL file per split.
#
# Every input line is a JSON object carrying a 'split' field. Lines whose
# split is 'train' or 'val' are copied verbatim to the matching output file;
# lines with any other split value are counted in the total but not written.
SRC_PATH = 'pubtabnet/PubTabNet_2.0.0.jsonl'
TRAIN_PATH = 'pubtabnet/train.jsonl'
VAL_PATH = 'pubtabnet/val.jsonl'
MAX_LINES = 599999  # hard cap on the number of records processed

train = 0  # records written to TRAIN_PATH
val = 0    # records written to VAL_PATH
tt = 0     # records parsed so far (all splits)

# Stream each record straight to its destination instead of buffering the
# whole dataset in memory. The encoding is explicit because JSON text is
# UTF-8 and the platform default encoding may not be. The source is opened
# first so that a missing input fails before any output file is truncated.
with open(SRC_PATH, 'r', encoding='utf-8') as f, \
        open(TRAIN_PATH, 'w', encoding='utf-8') as fw_train, \
        open(VAL_PATH, 'w', encoding='utf-8') as fw_val:
    for line in f:
        if tt >= MAX_LINES:
            break
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads would raise on them.
            continue
        js = json.loads(line)
        tt += 1
        if tt % 20000 == 0:
            print('Reading line ', tt)
        split = js['split']
        if split == 'train':
            train += 1
            fw_train.write(line)
        elif split == 'val':
            val += 1
            fw_val.write(line)

print('dataset prepared!')
print('train: ', train, 'val: ', val, 'total: ', tt)